
pacman::p_load(tidyverse, 
               yardstick, 
               tictoc,
               dtplyr,
               ggthemes,
               ggrepel,
               fst,
               dtplyr,
               lubridate
)


# Input file: the fitted/merged training data (fst format).
# SPECIAL_SUFFIX, TRAINING_SUFFIX, RAND_NO_CID_SMALLEST_LARGEST and
# MODEL_METRIC are not defined in this script; they must already exist in the
# environment that runs it.
file_name_fitted_merged <- paste0(
  "../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/",
  paste0("fitted_merged_training_", TRAINING_SUFFIX, "_",
         RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC),
  "/", "training_fitted_CRA.fst"
)

# Output folder (with trailing slash) that receives the CSV files written at
# the end of this script.
path_threshold_inputs <- paste0(
  "../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/",
  paste0("threshold_inputs_", TRAINING_SUFFIX, "_",
         RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC),
  "/"
)

# Time the read of the input file.
tic()
df_raw <- read_fst(path = file_name_fitted_merged)
toc()

# Analysis sample: rows with all three scores present and an income level
# that maps into one of the two groups.
df <- df_raw %>%
  filter(!is.na(xgb), !is.na(logistic), !is.na(riskscore)) %>%
  mutate(
    # Outcome flipped to "non-default" (1 - default), stored as a 0/1 factor.
    t_non_default = factor(
      1 - as.numeric(as.character(t_default)),
      levels = c(0, 1)
    ),
    # riskscore is used as is; xgb and logistic are sign-flipped.
    # NOTE(review): presumably so that a higher value means "safer" for all
    # three scores -- confirm against how xgb/logistic are defined upstream.
    value_riskscore = riskscore,
    value_xgb = -1 * xgb,
    value_logistic = -1 * logistic,
    # Income levels 1-2 -> 0, 3-4 -> 1, anything else -> NA (dropped below).
    d_Income_Level = case_when(
      Income_Level %in% 1:2 ~ 0,
      Income_Level %in% 3:4 ~ 1,
      TRUE ~ NA_real_
    )
  ) %>%
  filter(!is.na(d_Income_Level)) %>%
  select(-Income_Level) %>%
  glimpse()

# Number of quantile bins; the grid below has pct_bins_ + 1 break points.
pct_bins_ <- 200

# Quantiles of each score on the grid 0, 1/pct_bins_, ..., 1.
v_quantiles_riskscore <- quantile(df$value_riskscore, probs = seq(0, 1, 1/pct_bins_), names = FALSE)
v_quantiles_xgb <- quantile(df$value_xgb, probs = seq(0, 1, 1/pct_bins_), names = FALSE)
v_quantiles_logistic <- quantile(df$value_logistic, probs = seq(0, 1, 1/pct_bins_), names = FALSE)

# Build the bin-index -> score-value lookup for one score type.
# When several consecutive bin indices share the same quantile value, only the
# last (highest) index is kept, so `value` is unique in the result.
make_pct_table_ <- function(value_type_, v_quantiles) {
  tibble(
    value_type_ = value_type_,
    pct = 0:pct_bins_,
    value = v_quantiles
  ) %>%
    filter(!duplicated(value, fromLast = TRUE)) %>%
    glimpse()
}

df_pct_riskscore <- make_pct_table_("riskscore", v_quantiles_riskscore)
df_pct_xgb <- make_pct_table_("xgb", v_quantiles_xgb)
df_pct_logistic <- make_pct_table_("logistic", v_quantiles_logistic)

# All three lookups stacked; written out later as percentiles.csv.
df_pct <- bind_rows(df_pct_riskscore, df_pct_xgb, df_pct_logistic) %>%
  glimpse()

# Map a raw score vector to its quantile-bin index using one of the
# df_pct_* lookup tables (columns `pct` and `value`).
# The unique quantile values are the breaks; each interval is labelled with
# the bin index of its upper break, so the result is a numeric bin index.
# `include.lowest = TRUE` puts observations equal to the smallest break into
# the first bin. (The previous spelling `include_lowest` is not an argument of
# cut(); it was silently ignored, so those observations became NA and were
# dropped by the filter below.)
cut_to_pct_ <- function(x, df_pct_one) {
  bins <- cut(
    x,
    breaks = df_pct_one$value,
    labels = df_pct_one$pct[-1],
    include.lowest = TRUE
  )
  as.numeric(as.character(bins))
}

# Long table with one row per (cid, qtr, score type); `value` holds the
# quantile-bin index of the score rather than the raw score.
df_cut <- df %>%
  mutate(
    value_riskscore = cut_to_pct_(value_riskscore, df_pct_riskscore),
    value_xgb = cut_to_pct_(value_xgb, df_pct_xgb),
    value_logistic = cut_to_pct_(value_logistic, df_pct_logistic)
  ) %>%
  # Safety net: drop any row that could not be assigned to a bin.
  filter(!is.na(value_riskscore), !is.na(value_xgb), !is.na(value_logistic)) %>%
  select(cid, qtr, t_non_default, d_Income_Level, value_riskscore, value_xgb, value_logistic) %>%
  pivot_longer(
    cols = c(value_riskscore, value_xgb, value_logistic),
    names_to = "value_type",
    values_to = "value"
  ) %>%
  glimpse()

# Confusion-matrix counts for every threshold, income group and score type.
#
# The long table is first collapsed to one row per
# (income group, score type, bin index, outcome) with its observation count.
# Each threshold is then evaluated on that small table instead of rescanning
# every observation, which gives the same counts at a fraction of the cost.
df_counts_ <- df_cut %>%
  count(d_Income_Level, value_type, value, t_non_default, name = "n_obs")

# Thresholds are bin indices; an observation is predicted "non-default" when
# its bin index is strictly greater than the threshold.
v_thresh <- 0:pct_bins_

tic()
# One result tibble per threshold, bound once at the end rather than growing
# a data frame inside a loop.
l_thresh_ <- lapply(v_thresh, function(thresh_) {
  df_counts_ %>%
    mutate(pred_non_default_ = value > thresh_) %>%
    group_by(d_Income_Level, value_type) %>%
    summarize(
      tp = sum(n_obs[t_non_default == "1" & pred_non_default_]),
      fp = sum(n_obs[t_non_default == "0" & pred_non_default_]),
      tn = sum(n_obs[t_non_default == "0" & !pred_non_default_]),
      fn = sum(n_obs[t_non_default == "1" & !pred_non_default_]),
      .groups = "drop"
    ) %>%
    mutate(
      # Profit: +1 per true positive, -4 per false positive.
      profit = tp - 4 * fp,
      tpr = tp / (tp + fn),
      fpr = fp / (fp + tn),
      threshold = thresh_
    )
})
df_thresh <- bind_rows(l_thresh_)
toc()

################################################

# ROC points, ordered within each score type and income group from the
# highest threshold down to the lowest.
df_roc <- df_thresh %>%
  arrange(value_type, d_Income_Level, desc(threshold)) %>%
  glimpse()

# Share of non-defaults in the whole sample.
df_base_rate_all <- df %>%
  summarize(
    pct = sum(t_non_default == "1") / n(),
    group = "Total"
  ) %>%
  print()

# Share of non-defaults per income group, plus the overall "Total" row.
# The Total row has no d_Income_Level, so it is labelled via its existing
# `group` value.
df_base_rates <- df %>%
  group_by(d_Income_Level) %>%
  summarize(pct = sum(t_non_default == "1") / n()) %>%
  bind_rows(df_base_rate_all) %>%
  mutate(
    group = case_when(
      d_Income_Level == 1 ~ "Majority",
      d_Income_Level == 0 ~ "Minority",
      group == "Total" ~ "Total",
      TRUE ~ NA_character_
    )
  ) %>%
  glimpse()

# Number of observations per income group and each group's share of the sample.
df_proportions <- df %>%
  count(d_Income_Level) %>%
  mutate(
    pct = n / sum(n),
    group = case_when(
      d_Income_Level == 1 ~ "Majority",
      d_Income_Level == 0 ~ "Minority",
      TRUE ~ NA_character_
    )
  ) %>%
  glimpse()


# Write the four result tables as CSV files into path_threshold_inputs.
# File names and tables are paired by position.
v_file_names_out_ <- c("roc.csv", "base_rates.csv", "proportions.csv", "percentiles.csv")
l_tables_out_ <- list(df_roc, df_base_rates, df_proportions, df_pct)

for (k_ in seq_along(v_file_names_out_)) {
  file_name_out <- paste0(path_threshold_inputs, v_file_names_out_[k_])
  write_csv(l_tables_out_[[k_]], file_name_out)
}
